import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Fix the global PyTorch RNG seed so that parameter initialisation (and
# therefore the training results below) can be reproduced.
torch.manual_seed(1)
<torch._C.Generator at 0x7ff4203ac150>
July 26, 2021
The full notebook is available here.
Word embeddings are dense vectors of real numbers, one for each word in the vocabulary, which is the collection of words extracted from the dataset.
There are many ways to represent a word on a computer. For example, we can use its ASCII codes. Yet that only tells us what the word is, not what it means. Another option is a one-hot vector, in which we put the number 1 at the position of the represented word and 0 everywhere else. However, a one-hot vector has two main drawbacks. First, the vector is huge and sparse: its size equals the size of the vocabulary, and only one position holds a non-zero value. Second, it treats all words as independent, with no relation to each other. Technically speaking, it does not provide any notion of similarity between words.
Take an example from Pytorch documentation:
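Suppose we are building a language model, and suppose we have seen the following sentences in the training data: “The mathematician ran to the store.”, “The physicist ran to the store.”, and “The mathematician solved the open problem.”
Now suppose we get a new sentence never seen before in the training data: “The physicist solved the open problem.”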
This suggests that “physicist” is a good fit in the new, unseen sentence. That is what we mean by semantic similarity. It relies on the assumption that words appearing in similar contexts are semantically related to each other.
To encode the similarity between words, we can think up some semantic attributes. We then put those attributes in a vector and give each one a score: high scores for attributes the word has and low scores for those it does not. We can measure the similarity between two word vectors using the dot product, normalized by the vectors' lengths (i.e. cosine similarity). As a result, similar words will have a similarity score near 1 and dissimilar words will have a similarity score near zero.
Since thinking up the semantic attributes by hand is hard and labor-intensive, we can instead let them be parameters of the network that are updated during training. One drawback of this approach is that the attribute scores are not interpretable. That is, we do not know which attribute a specific score represents.
In summary, word embeddings are a representation of the semantics of a word, efficiently encoding semantic information that might be relevant to the task at hand
To read more about word embeddings in Pytorch, click here
Word2Vec is one of the approaches to developing word embeddings. There are two algorithms used in Word2Vec: continuous bag-of-words (CBOW) and skip-gram. CBOW aims to predict a center word from the surrounding context in terms of word vectors. Skip-gram does the opposite, and predicts the probability of context words from a center word. In this post, we will try to implement both.
The CBOW algorithm aims to predict a center word given the surrounding context in terms of word vectors. For example, given the sentence “The cat jumped over the puddle”, the algorithm treats {“The”, “cat”, “over”, “the”, “puddle”} as context words and {“jumped”} as the center word. The objective is to generate the center word from the context words.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# reproduction purpose
torch.manual_seed(1)
<torch._C.Generator at 0x7ff4203ac150>
# Hyper-parameters and training data for the CBOW model.
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
EMBEDDING_DIM = 10
HIDDEN_DIM = 128
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
# Iteration order of a set of strings changes between interpreter runs, so
# the indices are assigned from the sorted vocabulary. Together with the
# fixed torch seed this makes the word -> embedding-row mapping reproducible.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Build (context, target) pairs: the CONTEXT_SIZE words on each side of a
# position form the context, the word at that position is the target.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (raw_text[i - CONTEXT_SIZE:i]
               + raw_text[i + 1:i + CONTEXT_SIZE + 1])
    target = raw_text[i]
    data.append((context, target))
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed, passed through one
    hidden ReLU layer and projected onto the vocabulary; the output is a
    row of log-probabilities over all words.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: number of rows in the embedding table and size of
                the output distribution.
            embedding_dim: length of each word vector.
            hidden_dim: width of the hidden layer.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the center word.

        Args:
            inputs: tensor of context word indices; the embeddings are
                summed over its first dimension.

        Returns:
            Log-softmax scores over the vocabulary, reshaped to one row.
        """
        # Collapse all context embeddings into a single (1, embedding_dim) row.
        summed_context = self.embeddings(inputs).sum(dim=0).view(1, -1)
        hidden = F.relu(self.linear1(summed_context))
        return F.log_softmax(self.linear2(hidden), dim=-1)
# create your model and train. here are some functions to help you make
# the data ready for use by your module
def make_context_vector(context, word_to_ix):
    """Convert a sequence of words into a 1-D tensor of their indices.

    Args:
        context: iterable of words; each must be a key of ``word_to_ix``.
        word_to_ix: mapping from word to integer index.

    Returns:
        A ``torch.long`` tensor holding one index per word, in order.

    Raises:
        KeyError: if a word is missing from ``word_to_ix``.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long)
# Training: plain SGD, one (context, target) sample per optimisation step.
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)
losses = []  # summed loss of each epoch, in order

for epoch in range(10):
    total_loss = 0
    for context, target in data:
        context_idxs = make_context_vector(context, word_to_ix)
        target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)

        # Gradients accumulate across backward passes, so clear them first.
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, target_idx)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    losses.append(total_loss)

print(losses)
# Show the learned vector of one word as a sanity check.
spirits_ix = word_to_ix['spirits']
print(model.embeddings.weight[spirits_ix])
[238.0529305934906, 233.41328835487366, 228.94981503486633, 224.64973831176758, 220.50258708000183, 216.49783158302307, 212.62398993968964, 208.87176704406738, 205.23141360282898, 201.69729340076447]
tensor([-0.7098, -0.6179, -0.3807, 2.3069, -0.7957, 1.4458, 0.6856, 2.1891,
-0.2936, 0.5549], grad_fn=<SelectBackward>)
# Hyper-parameters and training data for the skip-gram model.
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
EMBEDDING_DIM = 10
HIDDEN_DIM = 128
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
# Iteration order of a set of strings changes between interpreter runs, so
# the indices are assigned from the sorted vocabulary. Together with the
# fixed torch seed this makes the word -> embedding-row mapping reproducible.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Build (center word, context word) pairs: every word within CONTEXT_SIZE
# positions of the center word yields one training sample.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (raw_text[i - CONTEXT_SIZE:i]
               + raw_text[i + 1:i + CONTEXT_SIZE + 1])
    target = raw_text[i]
    data.extend((target, value) for value in context)
class SkipGram(nn.Module):
    """Skip-gram model.

    The embedding of the center word is passed through one hidden ReLU
    layer and projected onto the vocabulary; the output holds
    log-probabilities over all words.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: number of rows in the embedding table and size of
                the output distribution.
            embedding_dim: length of each word vector.
            hidden_dim: width of the hidden layer.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-softmax scores over the vocabulary for ``inputs``.

        Args:
            inputs: tensor of center-word indices.
        """
        word_vectors = self.embeddings(inputs)
        hidden = F.relu(self.linear1(word_vectors))
        return F.log_softmax(self.linear2(hidden), dim=-1)
# create your model and train. here are some functions to help you make
# the data ready for use by your module
# Training: plain SGD, one (center word, context word) sample per step.
losses = []  # summed loss of each epoch, in order
loss_function = nn.NLLLoss()
model = SkipGram(vocab_size, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    # The loop variables are deliberately not called `input`/`output`:
    # at module level `input` would shadow the builtin for the rest of
    # the session.
    for center_word, context_word in data:
        input_index = torch.tensor([word_to_ix[center_word]], dtype=torch.long)
        target_index = torch.tensor([word_to_ix[context_word]], dtype=torch.long)

        # Gradients accumulate across backward passes, so clear them first.
        model.zero_grad()
        log_probs = model(input_index)
        loss = loss_function(log_probs, target_index)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    losses.append(total_loss)

print(losses)
# Show the learned vector of one word as a sanity check.
print(model.embeddings.weight[word_to_ix['spirits']])
[912.227198600769, 903.7498636245728, 895.8144631385803, 888.3975474834442, 881.4723126888275, 875.0154674053192, 869.0011675357819, 863.3996088504791, 858.1725707054138, 853.288257598877]
tensor([ 0.3021, 0.2816, -1.1773, 1.0418, 1.8390, -0.5845, -0.2637, 1.3842,
0.3855, 0.1923], grad_fn=<SelectBackward>)
Gensim implements CBOW and uses negative sampling for training by default. To toggle between the CBOW and skip-gram algorithms, pass the argument below when creating the Word2Vec instance.
sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
from gensim.test.utils import datapath
from gensim import utils
class MyCorpus:
    """An iterator that yields sentences"""

    def __iter__(self):
        """Yield one preprocessed document per line of the corpus file.

        Each line is passed through ``utils.simple_preprocess`` and the
        result is yielded. The file is re-opened on every iteration, so the
        corpus can be iterated more than once.
        """
        corpus_path = datapath('lee_background.cor')
        # The context manager closes the file when iteration finishes or the
        # generator is closed early, instead of leaking the handle.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there is one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)
array([-1.47548895e-02, 4.44000289e-02, 1.02321925e-02, 1.20065575e-02,
9.83571820e-03, -8.47978592e-02, 3.42624560e-02, 8.44758376e-02,
-3.13533121e-03, -1.38494289e-02, -4.28904686e-03, -5.30756600e-02,
7.55382003e-03, 2.79652104e-02, 4.44820989e-03, 1.32240532e-02,
-2.42202985e-03, -2.49751448e-03, -1.71462744e-02, -6.11230545e-02,
3.83632220e-02, 9.09661502e-03, 1.09449634e-02, -2.17360468e-03,
-1.88374687e-02, 2.02645455e-02, -1.86126940e-02, -1.27745485e-02,
-2.71721575e-02, 1.31690372e-02, 3.29722501e-02, -4.22514454e-02,
3.72793637e-02, -3.36719528e-02, -7.06554204e-03, 4.73929197e-02,
1.39981424e-02, 7.61039788e-03, -1.61971990e-02, -3.04519087e-02,
-1.60803776e-02, 4.38297074e-03, -8.02283920e-03, 1.50885303e-02,
2.63876691e-02, -1.95540637e-02, -2.64777783e-02, -3.67977191e-04,
7.01137306e-03, 3.12562287e-02, 1.64159592e-02, -2.16274485e-02,
-1.62629951e-02, 8.53445439e-04, -1.33869080e-02, 1.73475724e-02,
-1.21692673e-03, 2.21166899e-03, -2.24457402e-02, 4.26836731e-03,
-1.45576373e-02, 6.20996347e-04, 6.98805647e-03, -4.57839714e-03,
-2.95367688e-02, 6.10822700e-02, 1.47746662e-02, 3.35532837e-02,
-3.87191810e-02, 4.92215976e-02, -1.04450071e-02, 2.97265081e-03,
5.04135974e-02, -8.13318323e-03, 3.63118313e-02, 2.79957112e-02,
-1.12850778e-03, -2.14369707e-02, -4.13609855e-02, -1.58206820e-02,
-3.22486572e-02, 7.98239373e-03, -3.16767953e-02, 4.03956585e-02,
-3.79999110e-05, -1.51074128e-02, 2.10159868e-02, 3.33536156e-02,
4.75050472e-02, 1.45110274e-02, 3.53002362e-02, 5.23244813e-02,
4.45592292e-02, 9.90339927e-03, 8.80143940e-02, 2.01153327e-02,
4.54641357e-02, -4.78953496e-03, 7.65400566e-03, -5.82322525e-03],
dtype=float32)
Word2Vec is an unsupervised task, so there is no single good way to evaluate the result; evaluation depends on the application.
[=================================================-] 99.8% 1660.2/1662.8MB downloaded
# Print the first 10 vocabulary entries together with the vocabulary size.
# zip() with range(10) stops after ten items, replacing the manual break.
for index, word in zip(range(10), wv.index_to_key):
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")
word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said
One limitation of Word2Vec is that the model is unable to infer vectors for unseen words.
Note: FastText model can solve this limitation.
# Demonstrate the limitation: looking up a word the model has no vector for
# raises KeyError, which is reported instead of aborting the script.
try:
    vec_random = wv['vietname']
except KeyError:
    print("The word 'vietname' does not appear in this model")
The word 'vietname' does not appear in this model
# Compare 'scooter' against three other words and print one tab-separated
# line per pair: first word, second word, similarity score.
pairs = [('scooter', other) for other in ('chair', 'motorbike', 'football')]

for w1, w2 in pairs:
    print(f'{w1}\t{w2}\t{wv.similarity(w1, w2)}')
scooter chair 0.20833881199359894
scooter motorbike 0.7071131467819214
scooter football 0.07120829075574875
[('ww2', 0.6164373159408569), ('iraq', 0.6033741235733032), ('reagan', 0.5772603154182434), ('VietNam', 0.5732988119125366), ('afghanistan', 0.5602078437805176)]
Visualization can be used to notice semantic and syntactic trends in the data.
%matplotlib inline
from sklearn.decomposition import IncrementalPCA # inital reduction
from sklearn.manifold import TSNE # final reduction
import numpy as np # array handling
def reduce_dimensions(model):
    """Project the model's word vectors down to two dimensions with t-SNE.

    Args:
        model: object exposing ``wv.vectors`` and ``wv.index_to_key``.

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two lists with the first and
        second t-SNE coordinate of every word, and a numpy array with the
        corresponding words.
    """
    n_components = 2  # final num dimensions (2D, 3D, etc)

    # Pull the words and their vectors out of the model as numpy arrays.
    word_vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # A fixed random_state keeps the t-SNE layout repeatable.
    reducer = TSNE(n_components=n_components, random_state=0)
    embedded = reducer.fit_transform(word_vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels
# NOTE(review): reduce_dimensions reads `model.wv`, so `model` must be an
# object exposing `.wv` here — presumably a trained gensim Word2Vec model
# whose construction is not visible in this file; confirm before running.
x_vals, y_vals, labels = reduce_dimensions(model)
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as a Plotly text scatter plot.

    Args:
        x_vals: x coordinate of every word.
        y_vals: y coordinate of every word.
        labels: text shown at each point.
        plot_in_notebook: when true, render with ``iplot`` after
            initialising notebook mode; otherwise call ``plot`` with the
            filename 'word-embedding-plot.html'.
    """
    # Imported here so plotly is only required when this backend is used.
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')
def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot all points with matplotlib and label a random subset.

    Args:
        x_vals: x coordinate of every point.
        y_vals: y coordinate of every point.
        labels: one label per point; indexed in parallel with the
            coordinates.

    At most 25 points are annotated. With fewer than 25 labels every point
    is annotated instead of raising ``ValueError`` from ``sample``.
    """
    # Imported here so matplotlib is only required when this backend is used.
    import matplotlib.pyplot as plt
    import random

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    # A private generator seeded with 0 selects the same points as
    # `random.seed(0)` followed by `random.sample`, but leaves the global
    # random state of the caller untouched.
    rng = random.Random(0)
    sample_size = min(25, len(labels))
    selected_indices = rng.sample(range(len(labels)), sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))
# Choose the plotting backend: if calling get_ipython() raises (for example
# because the name is not defined), fall back to matplotlib; otherwise use
# plotly.
try:
    get_ipython()
    plot_function = plot_with_plotly
except Exception:
    plot_function = plot_with_matplotlib

plot_function(x_vals, y_vals, labels)